Loading libraries

dir.create(path = results_outpath)
library('SingleCellExperiment')
library('Seurat')
library('dplyr')
library('ggplot2')

Loading data

dat <- read.table(file = data_inpath, sep = ',', stringsAsFactors = FALSE, row.names = 1, header = TRUE)
dat <- Matrix::Matrix(data = as.matrix(dat), sparse = TRUE)
dat <- round(dat) # for some reason, counts are non-integers....
dat_sce <- SingleCellExperiment(assay = list('counts' = dat))
dat <- CreateSeuratObject(counts = dat, project = 'SVZ', assay = 'RNA')
# For SCE objects:
# dat[['sample']] <- sapply(X = strsplit(x = colnames(dat), split = '_'), FUN = `[`, 1)

Basic quality control

tmp <- dat@meta.data
tmp <- tmp[sample(x = nrow(tmp), size = nrow(tmp)),]
plot(x = tmp$nCount_RNA,
     y = tmp$nFeature_RNA,
     xlab = 'total UMI',
     ylab = '# genes detected',
     main = 'Detection rates by sample',
     sub = 'Black = SCtrl; Red = S1d; Green = S7d',
     col = sapply(X = as.character(tmp$orig.ident), FUN = function(x) switch(x, 'SCtrl' = 'black', 'S1d' = 'red3', 'S7d' = 'green')))

From the above, we see that there are a few cells with outlier numbers of total UMI and genes detected. These are likely to be doublets. Beyond that, there are no obvious biases based on sample of origin.

Next we compute gene-level and cell-level QC metrics.

dat <- PercentageFeatureSet(dat, pattern = '^mt-', col.name = 'percent.mt')
dat_sce <- as.SingleCellExperiment(x = dat)
mito_gene <- grep(pattern = '^mt-', x = rownames(dat_sce), value = TRUE)
dat_sce <- scater::calculateQCMetrics(object = dat_sce, exprs_values = 'counts', feature_controls = list('percent.mt' = mito_gene))
tmp <- as.data.frame(colData(dat_sce)[c('log10_total_counts',
                                        'log10_total_features_by_counts',
                                        'pct_counts_in_top_50_features',
                                        'pct_counts_in_top_100_features',                                            
                                        'pct_counts_in_top_200_features',
                                        'pct_counts_in_top_500_features',
                                        'log10_total_counts_percent.mt')])
dat@meta.data <- cbind(dat@meta.data, tmp)
qc_metrics_plot <- dat@meta.data %>%
  reshape2::melt(id.vars = 'orig.ident') %>%
  ggplot(mapping = aes(x = orig.ident, y = value)) +
  geom_violin(scale = 'width') +
  ggbeeswarm::geom_quasirandom(size = 0.25) +
  facet_wrap(. ~ variable, scales = 'free_y') +
  labs(title = 'Quality control metrics') +
  theme(axis.title = element_blank(),
        plot.title = element_text(size = 18, face = 'bold'),
        axis.line.y = element_line(size = 1),
        axis.ticks.y = element_line(size = 1))
qc_metrics_plot

In general, the samples share very similar distributions across quality control metrics. There are few outlier cells (see nCount_RNA i.e. library size). However, we note that the log library sizes (log10_total_counts) appear to have a bimodal distribution. (Note for JSC: previous “raw_counts_good_cells.csv” likely removed the lower mode during QC, which is why the total cells here are much greater than total cells previously, ~650 total). Additionally, we see that the percent of transcripts that map to mitochondrial genes have mean of 15.8%. The outlier cells with mito% greater than 3 median-absolute-deviations (30.4365647) should be considered suspect.

Since we cannot know whether the bimodal distributions are due to low-quality vs high-quality or true biological variation for now we will assume that most cells are of acceptable quality. Under this assumption, we can test two approaches for removing putative low-quality cells. The first is to apply a flat, adjustable threshold to each sample for each QC metric. The second is a multivariate approach via principal component analysis of all the QC metrics.

Flat QC thresholds:

# Designate which QC metrics to base filter
qc_metric <- c('log10_total_features_by_counts',
               'log10_total_counts',
               'pct_counts_in_top_50_features',
               'pct_counts_in_top_100_features',
               'pct_counts_in_top_200_features',
               'pct_counts_in_top_500_features',
               'percent.mt')
# Calculate 4*MAD thresholds (two-way)
qc_threshold <- apply(X = dat@meta.data[qc_metric],
                      MARGIN = 2,
                      FUN = function(x) {
                        hi <- median(x) + 3*mad(x, constant = 1)
                        lo <- median(x) - 3*mad(x, constant = 1)
                        return(c('hi' = hi, 'lo' = lo))
                        })
# Determine which are outliers for each metric
tmp <- sapply(X = qc_metric,
              tmp_data = dat@meta.data,
              tmp_threshold = qc_threshold,
              FUN = function(xx, tmp_data, tmp_threshold) {
                vals <- tmp_data[[xx]]
                hi_thresh <- tmp_threshold['hi', xx]
                lo_thresh <- tmp_threshold['lo', xx]
                outlier <- vals > hi_thresh | vals < lo_thresh
                return(outlier)
              })
# If a cell is above/below thresholds for any metric, consider outlier.
tmp <- apply(X = tmp, MARGIN = 1, FUN = any)
dat$outlier <- tmp
table('sample' = dat$orig.ident, 'is outlier?' = dat$outlier)

# Plot the outlier cells in violin
qc_metrics_plot <- dat@meta.data %>%
  reshape2::melt(id.vars = c('orig.ident','outlier')) %>%
  ggplot(mapping = aes(x = orig.ident, y = value)) +
  geom_violin(scale = 'width') +
  ggbeeswarm::geom_quasirandom(mapping = aes(color = ifelse(outlier, 'True','False')), size = 0.25) +
  scale_color_manual(values = c('True' = 'red', 'False' = 'black')) +
  facet_wrap(. ~ variable, scales = 'free_y') +
  labs(title = 'Quality control metrics') +
  theme(axis.title = element_blank(),
        plot.title = element_text(size = 18, face = 'bold'),
        axis.line.y = element_line(size = 1),
        axis.ticks.y = element_line(size = 1),
        legend.key = element_rect(fill = NA),
        legend.position = 'bottom') +
  guides(color = guide_legend(title = 'Is outlier?'))
qc_metrics_plot

##        is outlier?
## sample  FALSE TRUE
##   S1d     315   69
##   S7d     283  101
##   SCtrl   328   56

Multivariate QC approach:

qc_metric <- c('log10_total_features_by_counts',
                'log10_total_counts',
                'pct_counts_in_top_50_features',
                'pct_counts_in_top_100_features',
                'pct_counts_in_top_200_features',
                'pct_counts_in_top_500_features',
                'percent.mt')
dat_sce <- scater::runPCA(x = dat_sce,
                          use_coldata = TRUE,
                          selected_variables = qc_metric,
                          detect_outliers = TRUE,
                          ncomponents = 2)
plot(dat_sce@reducedDims$PCA_coldata, 
     col = ifelse(dat_sce$outlier, 'red','black'),
     sub = 'Red = discarded outlier; Black = retained')

Gene-level metrics:

scater::plotHighestExprs(dat_sce)

dat_full <- dat
dat <- dat[,dat$outlier == FALSE]

Basic cluster analysis

Now we do a preliminary cluster analysis to check that results are not strongly driven by what we would consider technical variation.

dat <- dat %>%
  NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA(npcs = 40)
# ElbowPlot(dat, ndims = 40)
dat <- dat %>%
  FindNeighbors(dims = 1:20) %>%
  RunTSNE(dims = 1:20) %>%
  FindClusters()
dat$orig.ident <- factor(dat$orig.ident, levels = c('SCtrl','S1d','S7d'))
# DimPlot(dat, label = TRUE, group.by = 'orig.ident')
plot1 <- DimPlot(dat, group.by = 'orig.ident') + 
  theme(axis.line = element_line(size = 0.5),
        axis.ticks = element_blank(), 
        axis.text = element_blank(), 
        axis.title = element_text(size = 10), 
        panel.border = element_rect(color = 'black', fill = NA))
plot2 <- DimPlot(dat, group.by = 'seurat_clusters', label = TRUE, label.size = 6) + 
  theme(axis.line = element_line(size = 0.5),
        axis.ticks = element_blank(), 
        axis.text = element_blank(),
        axis.title = element_text(size = 10), 
        panel.border = element_rect(color = 'black', fill = NA))
plot <- cowplot::plot_grid(plot1, plot2, ncol = 2, rel_widths = c(1,0.925))
plot

qc_metric <- c('log10_total_features_by_counts',
                'log10_total_counts',
                'pct_counts_in_top_50_features',
                'pct_counts_in_top_100_features',
                'pct_counts_in_top_200_features',
                'pct_counts_in_top_500_features',
                'percent.mt')
qc_tsne <- vector(mode = 'list', length = length(qc_metric))
names(qc_tsne) <- qc_metric
tmp <- cbind(dat@meta.data, dat[['tsne']]@cell.embeddings)
for(metric in qc_metric) {
  qc_tsne[[metric]] <- tmp %>%
    ggplot(mapping = aes_string(x = 'tSNE_1', y = 'tSNE_2')) +
    geom_point(mapping = aes_string(color = metric)) +
    scale_color_viridis_c() +
    theme(axis.line = element_line(size = 0.5),
          axis.ticks = element_blank(), 
          axis.text = element_blank(),
          axis.title = element_text(size = 10), 
          panel.border = element_rect(color = 'black', fill = NA),
          legend.title = element_blank()) +
    labs(title = metric) +
    guides(color = guide_colorbar(barwidth = 1, 
                                 frame.colour = 'black', 
                                 frame.linewidth = 1, 
                                 ticks.colour = 'black', 
                                 ticks.linewidth = 1))
}
qc_tsne <- cowplot::plot_grid(plotlist = qc_tsne, ncol = round(sqrt(length(qc_metric))))
qc_tsne

From these data, we observe that quality control metrics drive a significant portion of the clustering results. Specifically, cells that would be considered low quality cluster together and comprise clusters 0 and 1. Moving forward, we apply a flat QC threshold for library size and percent.mt based. These threshold values are based on previous scSeq studies.

QC refilter

umi_lo_threshold <- 1e3
log_umi_hi_threshold <- median(dat$log10_total_counts) + 3*mad(x = dat$log10_total_counts, constant = 1)
percent_mt_threshold <- 25

dat <- dat_full
tmp <- dat$nCount_RNA < umi_lo_threshold | dat$log10_total_counts > log_umi_hi_threshold | dat$percent.mt > percent_mt_threshold
dat$outlier <- tmp

# Plot the outlier cells in violin
qc_metrics_plot <- dat@meta.data %>%
  reshape2::melt(id.vars = c('orig.ident','outlier')) %>%
  ggplot(mapping = aes(x = orig.ident, y = value)) +
  geom_violin(scale = 'width') +
  ggbeeswarm::geom_quasirandom(mapping = aes(color = ifelse(outlier, 'True','False')), size = 0.35) +
  scale_color_manual(values = c('True' = 'red', 'False' = 'black')) +
  facet_wrap(. ~ variable, scales = 'free_y') +
  labs(title = 'Quality control metrics') +
  theme(axis.title = element_blank(),
        plot.title = element_text(size = 18, face = 'bold'),
        axis.line.y = element_line(size = 1),
        axis.ticks.y = element_line(size = 1),
        legend.key = element_rect(fill = NA),
        legend.position = 'bottom') +
  guides(color = guide_legend(title = 'Is outlier?'))
qc_metrics_plot

dat_full <- dat
dat <- dat[,dat$outlier == FALSE]

We repeat the basic cluster analysis.

firstup <- function(x) {
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  return(x)
}
cc_genes <- firstup(tolower(readLines(con = cc_genes_path)))
s.genes <- intersect(cc_genes[1:43], rownames(dat))
g2m.genes <- intersect(cc_genes[44:97], rownames(dat))
dat <- dat %>%
  NormalizeData() %>%
  FindVariableFeatures() %>%
  CellCycleScoring(s.features = s.genes, g2m.features = g2m.genes)
dat$CC.difference <- dat$S.Score - dat$G2M.Score
dat <- dat %>%
  ScaleData(vars.to.regress = 'CC.difference') %>%
  RunPCA(npcs = 40) %>%
# ElbowPlot(dat, ndims = 40)
  FindNeighbors(dims = 1:15) %>%
  RunTSNE(dims = 1:15) %>%
  FindClusters()
dat$orig.ident <- factor(dat$orig.ident, levels = c('SCtrl','S1d','S7d'))
# DimPlot(dat, label = TRUE, group.by = 'orig.ident')
plot1 <- DimPlot(dat, group.by = 'orig.ident') + 
  theme(axis.line = element_line(size = 0.5),
        axis.ticks = element_blank(), 
        axis.text = element_blank(), 
        axis.title = element_text(size = 10), 
        panel.border = element_rect(color = 'black', fill = NA))
plot2 <- DimPlot(dat, group.by = 'seurat_clusters', label = TRUE, label.size = 6) + 
  theme(axis.line = element_line(size = 0.5),
        axis.ticks = element_blank(), 
        axis.text = element_blank(),
        axis.title = element_text(size = 10), 
        panel.border = element_rect(color = 'black', fill = NA))
plot <- cowplot::plot_grid(plot1, plot2, ncol = 2, rel_widths = c(1,0.925))
plot

qc_metric <- c('log10_total_features_by_counts',
               'log10_total_counts',
               'pct_counts_in_top_200_features',
               'percent.mt')
qc_tsne <- vector(mode = 'list', length = length(qc_metric))
names(qc_tsne) <- qc_metric
tmp <- cbind(dat@meta.data, dat[['tsne']]@cell.embeddings)
for(metric in qc_metric) {
  qc_tsne[[metric]] <- tmp %>%
    ggplot(mapping = aes_string(x = 'tSNE_1', y = 'tSNE_2')) +
    geom_point(mapping = aes_string(color = metric)) +
    scale_color_viridis_c() +
    theme(axis.line = element_line(size = 0.5),
          axis.ticks = element_blank(), 
          axis.text = element_blank(),
          axis.title = element_text(size = 10), 
          panel.border = element_rect(color = 'black', fill = NA),
          legend.title = element_blank()) +
    labs(title = metric) +
    guides(color = guide_colorbar(barwidth = 1, 
                                 frame.colour = 'black', 
                                 frame.linewidth = 1, 
                                 ticks.colour = 'black', 
                                 ticks.linewidth = 1))
}
qc_tsne <- cowplot::plot_grid(plotlist = qc_tsne, ncol = round(sqrt(length(qc_metric))))
qc_tsne

Linear Dimensional Reduction

plot1 <- ElbowPlot(dat, ndims = 40)
dat <- JackStraw(object = dat, reduction = 'pca', dims = 40)
dat <- ScoreJackStraw(object = dat, reduction = 'pca', dims = 1:40)
plot2 <- JackStrawPlot(object = dat, dims = 1:40)
plot <- cowplot::plot_grid(plot1, plot2, ncol = 1, rel_heights = c(0.5,1))
plot

npcs <- which(dat[['pca']]@jackstraw$overall.p.values[,2] < 1e-10)
dat <- RunTSNE(object = dat, dims = npcs)

total_var <- sum(matrixStats::rowVars(x = dat[['RNA']]@scale.data))
pc_ev <- dat[['pca']]@stdev^2
explained_var <- round(pc_ev/total_var * 100, digits = 1)

tmp <- cbind(dat[['pca']]@cell.embeddings, dat@meta.data)
tmp_plots <- vector(mode = 'list', length = length(npcs))
for(i in 1:(length(npcs)-1)) {
  tmp_plots[[i]] <- vector(mode = 'list', length = length(npcs))
  for(j in (i+1):length(npcs)) {
    dim_i <- paste0('PC_', npcs[i])
    dim_j <- paste0('PC_', npcs[j])
    tmp_plots[[i]][[j]] <- tmp %>%
      ggplot(mapping = aes_string(y = dim_i, x = dim_j)) +
      geom_point(mapping = aes(color = orig.ident), size = 1) +
      geom_hline(yintercept = 0, linetype = 'dashed') +
      geom_vline(xintercept = 0, linetype = 'dashed') +
      theme(panel.background = element_rect(fill = NA, color = 'black'),
            panel.grid = element_blank(),
            axis.title = element_blank(),
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            legend.position = 'none')
  }
  tmp_plots[[i]][[i]] <- cowplot::ggdraw() + cowplot::draw_label(x = 0.5, y = 0.5, paste0(dim_i, '\n', explained_var[npcs[i]]))
  tmp_plots[[i]] <- cowplot::plot_grid(plotlist = tmp_plots[[i]], ncol = length(tmp_plots[[i]]))
}
# tmp_plots[[length(npcs)]] <- vector(mode = 'list', length = length(npcs) + 1)
tmp_plots[[length(npcs)]][[length(npcs)]] <- cowplot::ggdraw() + cowplot::draw_label(x = 0.5, y = 0.5, paste0('PC_', npcs[length(npcs)], '\n', explained_var[npcs[length(npcs)]]))
tmp_plots[[length(npcs)]] <- cowplot::plot_grid(plotlist = tmp_plots[[length(npcs)]], ncol = length(tmp_plots[[length(npcs)]]))
tmp_plots <- cowplot::plot_grid(plotlist = tmp_plots, ncol = 1)
tmp <- tmp %>%
      ggplot(mapping = aes_string(x = dim_i, y = dim_j)) +
      geom_point(mapping = aes(color = orig.ident), size = 1) +
      geom_hline(yintercept = 0, linetype = 'dashed') +
      geom_vline(xintercept = 0, linetype = 'dashed') +
      theme(panel.background = element_rect(fill = NA, color = 'black'),
            panel.grid = element_blank(),
            axis.title = element_blank(),
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            legend.key = element_rect(color = NA, fill = NA)) +
  guides(color = guide_legend(title = 'Sample', override.aes = list(size = 4)))
tmp_legend <- cowplot::get_legend(tmp)
tmp_plots <- cowplot::plot_grid(tmp_plots, tmp_legend, rel_widths = c(1, 0.05))
tmp_plots

# distance matrix in PCA-space
dat_dist <- dist(x = dat[['pca']]@cell.embeddings[,npcs])

# Assign values of k.param to iterate
k_range <- seq(from = 5, to = 35, by = 5) # sqrt(ncol(dat)) == 25.6907
names(k_range) <- paste('k', k_range, sep ='_')
k_results <- vector(mode = 'list', length = length(k_range))
names(k_results) <- names(k_range)

# Cluster data through k.params using a set louvain resolution
for(k in 1:length(k_range)) {
  tmp <- FindNeighbors(object = dat, k.param = k_range[k], dims = npcs)
  tmp <- FindClusters(tmp, resolution = 0.8, algorithm = 3)
  k_results[[names(k_range)[k]]] <- tmp@active.ident
}
k_results <- do.call(cbind, k_results)

# Compute silhouette coefficients for clustering results
silhouette_val <- vector(mode = 'list', length = ncol(k_results))
names(silhouette_val) <- colnames(k_results)
for(k in 1:ncol(k_results)) {
  silhouette_val[[colnames(k_results)[k]]] <- cluster::silhouette(x = k_results[,k], dist = dat_dist)
}

# Plot silhouette coefficients
tiff(filename = paste0(results_outpath, 'SilhouettePlots_kparamTesting.tiff'), height = 2400, width = 2440, res = 220)
plot_dim <- ceiling(sqrt(length(silhouette_val)))
{
  par(mfrow = c(plot_dim, plot_dim))
  for(k in 1:length(silhouette_val)) {
    cat(plot(silhouette_val[[k]], border = NA, main = paste(names(silhouette_val)[k], '; res = 0.8'), cex = 1.5))
  }
}
dev.off()
dat <- FindNeighbors(object = dat, dims = npcs, k.param = 15)

Now iterate through various louvain resolutions.

# distance matrix in PCA-space
dat_dist <- dist(x = dat[['pca']]@cell.embeddings[,npcs])

res_range <- c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
names(res_range) <- paste('res', res_range, sep ='_')
res_results <- vector(mode = 'list', length = length(res_range))
names(res_results) <- names(res_range)

# Cluster data through k.params using a set louvain resolution
for(r in 1:length(res_range)) {
  tmp <- FindClusters(dat, resolution = res_range[r], algorithm = 3)
  res_results[[names(res_range)[r]]] <- tmp@active.ident
}
res_results <- do.call(cbind, res_results)

# Compute silhouette coefficients for clustering results
silhouette_val <- vector(mode = 'list', length = ncol(res_results))
names(silhouette_val) <- colnames(res_results)
for(r in 1:ncol(res_results)) {
  silhouette_val[[colnames(res_results)[r]]] <- cluster::silhouette(x = res_results[,r], dist = dat_dist)
}

# Plot silhouette coefficients
tiff(filename = paste0(results_outpath, 'SilhouettePlots_resTesting.tiff'), height = 2400, width = 2440, res = 220)
plot_dim <- ceiling(sqrt(length(silhouette_val)))
{
  par(mfrow = c(ceiling(length(res_range)/plot_dim), plot_dim))
  for(r in 1:length(silhouette_val)) {
    cat(plot(silhouette_val[[r]], border = NA, main = paste('k.param = 15;', names(silhouette_val)[r]), cex = 1.5))
  }
}
dev.off()
dat <- FindNeighbors(object = dat, dims = npcs, k.param = 15)
dat <- FindClusters(object = dat, resolution = 0.8)
dat <- RunTSNE(object = dat)
plot1 <- DimPlot(dat, group.by = 'orig.ident', reduction = 'tsne', pt.size = 2) + 
  theme(axis.line = element_line(size = 0.5),
        axis.ticks = element_blank(), 
        axis.text = element_blank(), 
        axis.title = element_text(size = 10), 
        panel.border = element_rect(color = 'black', fill = NA))
plot2 <- DimPlot(dat, group.by = 'seurat_clusters', label = TRUE, label.size = 6, reduction = 'tsne', pt.size = 2) + 
  theme(axis.line = element_line(size = 0.5),
        axis.ticks = element_blank(), 
        axis.text = element_blank(),
        axis.title = element_text(size = 10), 
        panel.border = element_rect(color = 'black', fill = NA))
plot <- cowplot::plot_grid(plot1, plot2, ncol = 2, rel_widths = c(1,0.925))
plot

total_var <- sum(matrixStats::rowVars(x = dat[['RNA']]@scale.data))
pc_ev <- dat[['pca']]@stdev^2
explained_var <- round(pc_ev/total_var * 100, digits = 1)

color_by <- 'seurat_clusters'
tmp <- cbind(dat[['pca']]@cell.embeddings, dat@meta.data)
tmp_plots <- vector(mode = 'list', length = length(npcs))
for(i in 1:(length(npcs)-1)) {
  tmp_plots[[i]] <- vector(mode = 'list', length = length(npcs))
  for(j in (i+1):length(npcs)) {
    dim_i <- paste0('PC_', npcs[i])
    dim_j <- paste0('PC_', npcs[j])
    tmp_plots[[i]][[j]] <- tmp %>%
      ggplot(mapping = aes_string(y = dim_i, x = dim_j)) +
      geom_point(mapping = aes_string(color = color_by), size = 1) +
      geom_hline(yintercept = 0, linetype = 'dashed') +
      geom_vline(xintercept = 0, linetype = 'dashed') +
      theme(panel.background = element_rect(fill = NA, color = 'black'),
            panel.grid = element_blank(),
            axis.title = element_blank(),
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            legend.position = 'none')
  }
  tmp_plots[[i]][[i]] <- cowplot::ggdraw() + cowplot::draw_label(x = 0.5, y = 0.5, paste0(dim_i, '\n', explained_var[npcs[i]]))
  tmp_plots[[i]] <- cowplot::plot_grid(plotlist = tmp_plots[[i]], ncol = length(tmp_plots[[i]]))
}
# tmp_plots[[length(npcs)]] <- vector(mode = 'list', length = length(npcs) + 1)
tmp_plots[[length(npcs)]][[length(npcs)]] <- cowplot::ggdraw() + cowplot::draw_label(x = 0.5, y = 0.5, paste0('PC_', npcs[length(npcs)], '\n', explained_var[npcs[length(npcs)]]))
tmp_plots[[length(npcs)]] <- cowplot::plot_grid(plotlist = tmp_plots[[length(npcs)]], ncol = length(tmp_plots[[length(npcs)]]))
tmp_plots <- cowplot::plot_grid(plotlist = tmp_plots, ncol = 1)
tmp <- tmp %>%
      ggplot(mapping = aes_string(x = dim_i, y = dim_j)) +
      geom_point(mapping = aes_string(color = color_by), size = 1) +
      geom_hline(yintercept = 0, linetype = 'dashed') +
      geom_vline(xintercept = 0, linetype = 'dashed') +
      theme(panel.background = element_rect(fill = NA, color = 'black'),
            panel.grid = element_blank(),
            axis.title = element_blank(),
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            legend.key = element_rect(color = NA, fill = NA)) +
  guides(color = guide_legend(title = 'Cluster', override.aes = list(size = 4)))
tmp_legend <- cowplot::get_legend(tmp)
tmp_plots <- cowplot::plot_grid(tmp_plots, tmp_legend, rel_widths = c(1, 0.05))
tmp_plots

markers <- FindAllMarkers(object = dat, assay = 'RNA', slot = 'data', test.use = 'wilcox', only.pos = TRUE)
saveRDS(object = markers, file = paste0(results_outpath, 'graph_de_markers.rds'))

Cluster DE genes

These are globally DE genes i.e. each cluster vs all other cells combined. DE is calculated using a Wilcoxon Rank sum test, which is a conservative approach (low false-positives but high false-negatives).

markers <- readRDS(file = paste0(results_outpath, 'graph_de_markers.rds'))
markers <- markers[c(6,7,2,5,1,3,4)]
markers[c('p_val_adj','avg_logFC','p_val','pct.1','pct.2')] <- signif(markers[c('p_val_adj','avg_logFC','p_val','pct.1','pct.2')]
, digits = 3)
DT::datatable(data = markers, rownames = FALSE, extensions = c('Scroller','Buttons'), options = list(scroller = TRUE, deferRender = TRUE, scrollY = 350, fixedColumns = TRUE, autoWidth = TRUE, pageLength = 10, dom = 'Bfrtip', buttons = list(extend = 'collection', buttons = c('csv', 'excel', 'pdf'))))

Heatmap of the top 7 DE genes for each cluster. Some clusters have overlapping globally-DE genes (e.g. cluster 4 with clusters 0 and 6).

markers <- readRDS(file = paste0(results_outpath, 'graph_de_markers.rds'))
ngenes <- 7
marker.genes <- c()
for(c in levels(markers$cluster)) {
  tmp_results <- markers[markers$cluster == c & markers$p_val_adj < 1e-03,]
  tmp_genes <- tmp_results[order(tmp_results$avg_logFC, decreasing = TRUE),]$gene
  genes_index <- which(!(tmp_genes %in% marker.genes) & tmp_genes != 'AA467197')[1:ngenes]
  marker.genes <- c(marker.genes, tmp_genes[genes_index])
}
names(marker.genes) <- rep(levels(markers$cluster), each = ngenes)

Idents(dat) <- 'seurat_clusters'
DefaultAssay(dat) <- 'RNA'
tmp <- ScaleData(dat, features = marker.genes)
avg.exp <- FetchData(tmp, vars = c('seurat_clusters', marker.genes), slot = 'scale.data') %>%
  reshape2::melt(id.vars = c('seurat_clusters')) %>%
  group_by(seurat_clusters, variable) %>%
  summarise(avg.exp = mean(value))

for(c in levels(markers$cluster)) {
  tmp <- avg.exp[avg.exp$seurat_clusters == c & avg.exp$variable %in% marker.genes[names(marker.genes) == c],]
  tmp <- tmp[order(tmp$avg.exp, decreasing = TRUE),]$variable
  marker.genes[names(marker.genes) == c] <- as.character(tmp)
}
avg.exp <- avg.exp %>% mutate(variable = factor(variable, levels = rev(marker.genes)))

paletteLength <- 200
max.z <- 2
myColor <- rev(colorRampPalette(RColorBrewer::brewer.pal(n = 9, name = 'RdBu'))(paletteLength))
myBreaks <- c(seq(min(avg.exp$avg.exp), 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(min(max.z, max(avg.exp$avg.exp))/paletteLength, min(max.z, max(avg.exp$avg.exp)), length.out=floor(paletteLength/2)))
marker.heatmap <- avg.exp %>%
  ggplot(mapping = aes(x = seurat_clusters, y = variable, fill = avg.exp)) +
  geom_tile(color = 'black', size = 0.25) +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_discrete(expand = c(0,0)) +
  scale_fill_gradientn(colors = myColor, 
                     values = scales::rescale(x = myBreaks, to = c(0,1)),
                      limits = c(NA, max.z), na.value = myColor[paletteLength], breaks = seq(-5,5,0.5)) +
  theme(axis.text.x = element_text(size = 12),
        axis.title = element_blank()) +
  guides(fill = guide_colorbar(title = 'Expression\nz-score', barwidth = 1.25, frame.colour = 'black', frame.linewidth = 1, ticks.colour = 'black', ticks.linewidth = 1))

marker.heatmap

ggsave(filename = paste0(results_outpath, 'graph_cluster_markers_heatmap.tiff'), plot = marker.heatmap, device = 'tiff', height = 9, width = 4)

Cluster dynamics

Next we explore the population changes for each of the clusters between the 3 conditions. We calculate the absolute numbers of cells per cluster as well as the proportion out of all cells for each condition.

counts <- data.frame(table(dat$seurat_clusters, dat$orig.ident))
names(counts) <- c('seurat_clusters','orig.ident','count')
numbers <- counts %>%
  ggplot(mapping = aes(x = orig.ident, y = count, group = seurat_clusters)) + 
  geom_bar(aes(fill = seurat_clusters), stat = 'identity', color = 'black', size = 0.75) +
  scale_y_continuous() +
  scale_x_discrete() + 
  ylab('# of cells') + 
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 12), 
        axis.text.x = element_text(size = 12), 
        axis.line = element_line(size = 1),
        panel.background = element_rect(fill = NA),
        axis.text.y = element_text(size = 12))
props <- counts %>% 
  ggplot(mapping = aes(x = orig.ident, y = count, group = seurat_clusters)) + 
  geom_bar(aes(fill = seurat_clusters), stat = 'identity', color = 'black', size = 0.75, position = 'fill') +
  scale_y_continuous() + 
  scale_x_discrete() +
  ylab('% of cells') + 
  theme(axis.title.x = element_blank(), 
        axis.title.y = element_text(size = 12), 
        axis.text.x = element_text(size = 12),
        axis.line = element_line(size = 1), 
        panel.background = element_rect(fill = NA), 
        axis.text.y = element_text(size = 12), 
        legend.title = element_blank(), 
        legend.text = element_text(size = 12))
legend <- cowplot::get_legend(plot = props)
pop.dynamics <- cowplot::plot_grid(numbers + theme(legend.position = 'none'), props + theme(legend.position = 'none'), legend, ncol = 3, rel_widths = c(1, 1, 0.25))

pop.dynamics

ggsave(filename = paste0(results_outpath, 'population_dynamics.tiff'), plot = pop.dynamics, device = 'tiff', height = 3.75, width = 8)
sessionInfo()